#chargement des librairies
library(randomForest)
## Warning: le package 'randomForest' a été compilé avec la version R 4.2.2
## randomForest 4.7-1.1
## Type rfNews() to see new features/changes/bug fixes.
library(gbm)
## Warning: le package 'gbm' a été compilé avec la version R 4.2.2
## Loaded gbm 2.1.8.1
library(mgcv)
## Warning: le package 'mgcv' a été compilé avec la version R 4.2.2
## Le chargement a nécessité le package : nlme
## This is mgcv 1.8-41. For overview type 'help("mgcv-package")'.
library(party)
## Warning: le package 'party' a été compilé avec la version R 4.2.2
## Le chargement a nécessité le package : grid
## Le chargement a nécessité le package : mvtnorm
## Le chargement a nécessité le package : modeltools
## Le chargement a nécessité le package : stats4
## Le chargement a nécessité le package : strucchange
## Warning: le package 'strucchange' a été compilé avec la version R 4.2.2
## Le chargement a nécessité le package : zoo
## Warning: le package 'zoo' a été compilé avec la version R 4.2.2
##
## Attachement du package : 'zoo'
## Les objets suivants sont masqués depuis 'package:base':
##
## as.Date, as.Date.numeric
## Le chargement a nécessité le package : sandwich
## Warning: le package 'sandwich' a été compilé avec la version R 4.2.2
library(electBook)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library(forecast)
## Warning: le package 'forecast' a été compilé avec la version R 4.2.2
##
## Attachement du package : 'forecast'
## L'objet suivant est masqué depuis 'package:nlme':
##
## getResponse
#chargment de données
df<-read.csv("df_ajout_var_calendar.csv",header=T,stringsAsFactors = T)
df<-df
#View(head(df))
str(df)
## 'data.frame': 455520 obs. of 12 variables:
## $ DateHeure : Factor w/ 35040 levels "2021-01-01 00:00:00+00:00",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ region : Factor w/ 13 levels "Auvergne-Rhône-Alpes",..: 1 12 11 3 2 10 9 13 8 7 ...
## $ conso : int 8576 6395 5840 3984 2671 6512 4326 4230 9914 6340 ...
## $ temp : num 276 277 278 274 275 ...
## $ heure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ date : Factor w/ 730 levels "2021-01-01","2021-01-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ period_journey: int 0 0 0 0 0 0 0 0 0 0 ...
## $ isHolyday : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ weekday : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Weekend : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ Mois : int 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
df$DateHeure<-as.Date(df$DateHeure,format = "%Y-%m-%d %H:%M:%S")
#5 premières lignes
head(df)
## DateHeure region conso temp heure date
## 1 2021-01-01 Auvergne-Rhône-Alpes 8576 276.1250 0 2021-01-01
## 2 2021-01-01 PACA 6395 277.2833 0 2021-01-01
## 3 2021-01-01 Occitanie 5840 278.4167 0 2021-01-01
## 4 2021-01-01 Bretagne 3984 273.9833 0 2021-01-01
## 5 2021-01-01 Bourgogne-Franche-Comté 2671 275.1500 0 2021-01-01
## 6 2021-01-01 Nouvelle-Aquitaine 6512 275.1500 0 2021-01-01
## period_journey isHolyday weekday Weekend Mois year
## 1 0 True 4 False 1 2021
## 2 0 True 4 False 1 2021
## 3 0 True 4 False 1 2021
## 4 0 True 4 False 1 2021
## 5 0 True 4 False 1 2021
## 6 0 True 4 False 1 2021
#info sur les colonnes
str(df)
## 'data.frame': 455520 obs. of 12 variables:
## $ DateHeure : Date, format: "2021-01-01" "2021-01-01" ...
## $ region : Factor w/ 13 levels "Auvergne-Rhône-Alpes",..: 1 12 11 3 2 10 9 13 8 7 ...
## $ conso : int 8576 6395 5840 3984 2671 6512 4326 4230 9914 6340 ...
## $ temp : num 276 277 278 274 275 ...
## $ heure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ date : Factor w/ 730 levels "2021-01-01","2021-01-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ period_journey: int 0 0 0 0 0 0 0 0 0 0 ...
## $ isHolyday : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ weekday : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Weekend : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ Mois : int 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
str(df)
## 'data.frame': 455520 obs. of 12 variables:
## $ DateHeure : Date, format: "2021-01-01" "2021-01-01" ...
## $ region : Factor w/ 13 levels "Auvergne-Rhône-Alpes",..: 1 12 11 3 2 10 9 13 8 7 ...
## $ conso : int 8576 6395 5840 3984 2671 6512 4326 4230 9914 6340 ...
## $ temp : num 276 277 278 274 275 ...
## $ heure : int 0 0 0 0 0 0 0 0 0 0 ...
## $ date : Factor w/ 730 levels "2021-01-01","2021-01-02",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ period_journey: int 0 0 0 0 0 0 0 0 0 0 ...
## $ isHolyday : Factor w/ 2 levels "False","True": 2 2 2 2 2 2 2 2 2 2 ...
## $ weekday : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Weekend : Factor w/ 2 levels "False","True": 1 1 1 1 1 1 1 1 1 1 ...
## $ Mois : int 1 1 1 1 1 1 1 1 1 1 ...
## $ year : int 2021 2021 2021 2021 2021 2021 2021 2021 2021 2021 ...
df$DateHeure<-as.Date(df$DateHeure,format = "%Y-%m-%d %H:%M:%S")
#df[['DateHeure']] <- as.POSIXct(df[['DateHeure']],format = "%Y-%m-%d %H:%M:%S")
df=subset(df ,region=="France")
df$Weekend<-as.logical(df$Weekend)
df$Weekend <- as.integer(df$Weekend)
df$isHolyday<-as.logical(df$isHolyday)
df$isHolyday <- as.integer(df$isHolyday)
col=c("DateHeure","conso","temp","heure","period_journey","isHolyday"
,"weekday","Weekend","Mois")
#selection des variable
df_france=df[col]
#praparation des données
last <- tail(seq(nrow(df_france)), 48)
df_france$conso48 <- c(rep(NA, 48), df_france$conso[-last])
#separataion data
n_test <- 48
n_train<-nrow(df_france)-n_test
df_train <- head(df_france,n_train)
df_test <- tail(df_france,48)
#visualisation
plot(df_france$DateHeure,
df_france$conso, type = 'n')
lines(df_train$DateHeure, df_train$conso)
lines(df_test$DateHeure, df_test$conso, col = 2)

#Random Forest
formula_ML <- conso ~ temp + heure + period_journey + isHolyday +
weekday + Weekend + Mois
expert_rf <- randomForest(formula_ML, data = na.omit(df_train))
expert_rf_forecast <- predict(expert_rf, newdata=df_test)
mean(expert_rf_forecast)
## [1] 51819.14
print(sqrt(mean((expert_rf_forecast-df_test[,"conso"])**2)))
## [1] 7335.336
print(sqrt(mean((expert_rf_forecast-df_test[,"conso"])**2)))
## [1] 7335.336
#auto-arima sans covariables
ts.train <- ts(df_train, frequency = 7)
ts.test <- ts(ts.train, frequency = 7)
fit1=auto.arima(df_train[,"conso"])
forecast_auto_arima1=forecast(fit1,h=48)
forecast_auto_arima1 <- forecast_auto_arima1$mean
forecast_auto_arima1
## Time Series:
## Start = 34993
## End = 35040
## Frequency = 1
## [1] 48803.92 48900.33 49027.94 49194.22 49354.01 49517.65 49670.22 49812.47
## [9] 49941.31 50056.74 50158.85 50248.36 50326.21 50393.47 50451.26 50500.67
## [17] 50542.73 50578.40 50608.56 50633.98 50655.35 50673.27 50688.26 50700.78
## [25] 50711.22 50719.90 50727.11 50733.10 50738.06 50742.17 50745.56 50748.36
## [33] 50750.67 50752.57 50754.14 50755.43 50756.49 50757.36 50758.08 50758.67
## [41] 50759.15 50759.54 50759.86 50760.13 50760.34 50760.52 50760.66 50760.78
print(sqrt(mean((forecast_auto_arima1-df_test[,"conso"])**2)))
## [1] 6313.171
#auto-arima avec covariable
#### objet ts ###################
date_start=as.Date("2021-01-01")
date_end=as.Date("2022-12-31")
#ts.data <- ts(data = data, start =date_start, end =date_end,frequency=48)
ts.train <- ts(df_train, frequency = 7)
ts.test <- ts(df_test, frequency = 7)
fit2=auto.arima(df_train[,"conso"],xreg=as.matrix(df_train[,3:9]))
forecast_auto_arima2<-forecast(fit2,h=48,xreg=as.matrix(df_test[,3:9]))
forecast_auto_arima2 <- forecast_auto_arima2$mean
print(sqrt(mean((forecast_auto_arima2-df_test[,"conso"])**2)))
## [1] 7240.465
library(prophet)
## Warning: le package 'prophet' a été compilé avec la version R 4.2.2
## Le chargement a nécessité le package : Rcpp
## Le chargement a nécessité le package : rlang
# charger les donnée
# préparer les données pour Prophet
df_prophet=df_france
df_prophet$ds <-df_prophet$DateHeure
df_prophet$y <- df_prophet$conso
df_prophet$holiday <- ifelse(df_prophet$isHolyday == "yes", 1, 0)
# inclure les variables exogènes
str(df_prophet)
## 'data.frame': 35040 obs. of 13 variables:
## $ DateHeure : Date, format: "2021-01-01" "2021-01-01" ...
## $ conso : int 67010 67071 65052 64918 64376 63758 61883 60469 59103 58441 ...
## $ temp : num 275 275 275 275 274 ...
## $ heure : int 0 0 1 1 2 2 3 3 4 4 ...
## $ period_journey: int 0 0 0 0 0 0 0 0 0 0 ...
## $ isHolyday : int 1 1 1 1 1 1 1 1 1 1 ...
## $ weekday : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Weekend : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Mois : int 1 1 1 1 1 1 1 1 1 1 ...
## $ conso48 : int NA NA NA NA NA NA NA NA NA NA ...
## $ ds : Date, format: "2021-01-01" "2021-01-01" ...
## $ y : int 67010 67071 65052 64918 64376 63758 61883 60469 59103 58441 ...
## $ holiday : num 0 0 0 0 0 0 0 0 0 0 ...
df_prophet<-df_prophet[,setdiff(colnames(df_prophet),c("DateHeure","conso"))]
summary(df_prophet)
## temp heure period_journey isHolyday
## Min. :270.3 Min. : 0.00 Min. :0.0000 Min. :0.00000
## 1st Qu.:280.8 1st Qu.: 5.75 1st Qu.:0.0000 1st Qu.:0.00000
## Median :285.6 Median :11.50 Median :0.5000 Median :0.00000
## Mean :286.0 Mean :11.50 Mean :0.8333 Mean :0.03014
## 3rd Qu.:291.0 3rd Qu.:17.25 3rd Qu.:1.2500 3rd Qu.:0.00000
## Max. :309.5 Max. :23.00 Max. :3.0000 Max. :1.00000
##
## weekday Weekend Mois conso48
## Min. :0.000 Min. :0.0000 Min. : 1.000 Min. :29660
## 1st Qu.:1.000 1st Qu.:0.0000 1st Qu.: 4.000 1st Qu.:43378
## Median :3.000 Median :0.0000 Median : 7.000 Median :50001
## Mean :3.004 Mean :0.2863 Mean : 6.526 Mean :52163
## 3rd Qu.:5.000 3rd Qu.:1.0000 3rd Qu.:10.000 3rd Qu.:59978
## Max. :6.000 Max. :1.0000 Max. :12.000 Max. :88440
## NA's :48
## ds y holiday
## Min. :2021-01-01 Min. :29660 Min. :0
## 1st Qu.:2021-07-02 1st Qu.:43378 1st Qu.:0
## Median :2021-12-31 Median :49988 Median :0
## Mean :2021-12-31 Mean :52153 Mean :0
## 3rd Qu.:2022-07-02 3rd Qu.:59969 3rd Qu.:0
## Max. :2022-12-31 Max. :88440 Max. :0
##
# entraîner le modèle
m <- prophet(df_prophet,yearly.seasonality=TRUE,
weekly.seasonality = TRUE,holidays = df_prophet[df_prophet$holiday == 1,
c("ds", "holiday")])
## Disabling daily seasonality. Run prophet with daily.seasonality=TRUE to override this.
# faire des prévisions
future <- make_future_dataframe(m, periods = 1)
forecast_prophet <- predict(m, future)
# afficher les prévisions
plot(m, forecast_prophet)

forecast_prophet<-tail(forecast_prophet$yhat,48)
######## méthodes tslm ################
#regardons le le modèle tslm en incluant la saisonnalité
formula_tslm <- conso ~ temp + heure + period_journey + isHolyday +
weekday + Weekend + Mois
fit_tslm=tslm(formula_tslm,data=ts.train)
forecast_tslm<-predict(fit_tslm, newdata =ts.test[,3:9])
print(sqrt(mean((forecast_tslm-df_test[,"conso"])**2)))
## [1] 5613.051
########## methodes par aggrégations des poids des experts##########
# 2. Mixture ####
library(opera)
## Warning: le package 'opera' a été compilé avec la version R 4.2.2
experts <- cbind(expert_rf_forecast,forecast_auto_arima1,forecast_auto_arima2,
forecast_tslm,forecast_prophet)
colnames(experts) <- c("rf","arima_sans_covariable","arima_avec_covariable",
"tslm","prophet")
or <- oracle(df_test$conso,experts,
model = "convex",
loss.type = "square")
rmse_exp <- apply(experts, 2,
function(x){sqrt(mean((x - df_test$conso)^2))})
rmse_exp %>% round(, digits = 0) %>% sort
## tslm arima_sans_covariable arima_avec_covariable
## 5613 6313 7240
## rf prophet
## 7335 10879
# valeur théorique
M <- mean((df_train$conso - df_train$conso48)^2, na.rm = T)
learning.rate <- (1/M) * sqrt(8*log(ncol(experts))) / nrow(df_test)
agg.online_theoric<- mixture(Y = df_test$conso,
experts = experts,
model = 'EWA',
loss.type = "square",
loss.gradient = F,
parameter=list(eta=learning.rate))
plot(agg.online_theoric)
## Warning in par(def.par, new = FALSE): argument 1 does not name a graphical
## parameter
summary(agg.online_theoric)
## Aggregation rule: EWA
## Loss function: squareloss
## Gradient trick: FALSE
## Coefficients:
## rf arima_sans_covariable arima_avec_covariable tslm prophet
## 0.141 0.28 0.151 0.422 0.00598
##
## Number of experts: 5
## Number of observations: 48
## Dimension of the data: 1
##
## rmse mape
## EWA 6450 0.138
## Uniform 7150 0.153
# optimisation sur les données
agg.online<- mixture(Y = df_test$conso , experts = experts,
model = 'EWA', loss.type = "square",
loss.gradient = F)
plot(agg.online)
## Warning in par(def.par, new = FALSE): argument 1 does not name a graphical
## parameter
summary(agg.online)
## Aggregation rule: EWA
## Loss function: squareloss
## Gradient trick: FALSE
## Coefficients:
## rf arima_sans_covariable arima_avec_covariable tslm prophet
## 0 0 0 1 0
##
## Number of experts: 5
## Number of observations: 48
## Dimension of the data: 1
##
## rmse mape
## EWA 5680 0.117
## Uniform 7150 0.153